/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: December 2024

Inputs: 	pdb_full.dta

Outputs: 	pdb_full_entities.dta

Purpose: 	Generate a dataset unique at the entity level
			Generate new variables specific to this project

*******************************************************************************/

* Start from an empty session and load the full PDB build
clear all
use "${data_built}pdb_full.dta", clear

* Entity ID is only allowed to be missing for single-entity structures;
* verify that, assign ID 1 to those rows, and confirm the key is unique
assert numEntities == 1 if entityId == .
replace entityId = 1 if entityId == .
isid structureId entityId

* Backfill publication year with the release year when the year is missing
* but a PMID is present.
* Note: publicationYear is not in the keep list below, so this backfill does
* not reach the saved entity-level file.
replace publicationYear = year(releaseDate) 								///
	if publicationYear == . & pubmedId != .

* Entity-level variables carried into the output
local entityLevelVars entityId geneName authorAssignedName taxonomy 		///
	taxonomyId before_pdb* clusterNum*

* Restrict to the key, the entity-level variables, and the PMID; put the
* columns in a fixed order; re-check uniqueness and sort on the key
keep structureId `entityLevelVars' pubmedId
order structureId `entityLevelVars' pubmedId
isid structureId entityId, sort
	
* Generate new variables specific to this project
*
* Cluster-level counts (clusterCount*) and the priority flag are computed on
* the subsample with a non-missing clusterNum100 and then merged back onto all
* entities, so entities without a cluster number get missing values for them.

	* Build a date lookup keyed on structureId-entityId.
	* The data in memory has entityId set to 1 wherever it was missing, but
	* the file on disk does not. Apply the same fill here so that every
	* entity in memory can find its dates in the merge below.
	preserve
	use structureId entityId collectionDate depositionDate releaseDate 		///
		using "${data_built}pdb_full.dta", clear
	replace entityId = 1 if entityId == .
	isid structureId entityId
	tempfile entity_dates
	save `entity_dates'
	restore

	preserve

	* Drop if missing cluster number
	drop if clusterNum100 == .

	* Number of entities in cluster
	bys clusterNum100: gen clusterCount = _N

	* Attach collection, deposition and release dates. Every entity in memory
	* must match; lookup rows without a cluster number are discarded.
	merge 1:1 structureId entityId using `entity_dates', 					///
		keepusing(collectionDate depositionDate releaseDate)
	assert _merge != 1
	keep if _merge == 3
	drop _merge

	* Days between each deposit and the first deposit in its cluster
	bys clusterNum100: egen firstDeposition = min(depositionDate)
	gen depositionMargin = depositionDate - firstDeposition

	* Number of deposits in cluster within the first year / first two years,
	* and number of deposits after the first year / first two years.
	* NOTE(review): a missing depositionDate gives a missing depositionMargin,
	* which compares as larger than any number, so such an entity is counted
	* in the "post" group. Confirm depositionDate is never missing.
	foreach yr in 1 2 {
		local windowDays = 365 * `yr'

		gen less`yr'YrEntity = (depositionMargin <= `windowDays')
		gen greater`yr'YrEntity = (depositionMargin > `windowDays')
		assert less`yr'YrEntity + greater`yr'YrEntity == 1

		bys clusterNum100: egen clusterCount`yr'Yr = total(less`yr'YrEntity)
		bys clusterNum100: egen clusterCountPost`yr'Yr = 					///
			total(greater`yr'YrEntity)
		assert clusterCount`yr'Yr + clusterCountPost`yr'Yr == clusterCount
	}

	* Number of race, non-race deposits defined by collection vs. release dates

	* If collection date is after release date (<1% of observations),
	* code that collection date as missing
	replace collectionDate = . if collectionDate > releaseDate

	* Define difference between collection date and first release in the
	* cluster (note: collection date is frequently missing)
	bys clusterNum100: egen firstRelease = min(releaseDate)
	gen collectionFirstRelease = collectionDate - firstRelease

	* Define an entity as a race entity if collection date <= first release
	* date; both flags stay missing when the difference is missing
	gen raceEntity = (collectionFirstRelease <= 0) 							///
		if !missing(collectionFirstRelease)
	gen nonRaceEntity = 1 - raceEntity
	assert raceEntity + nonRaceEntity == 1 if !missing(collectionFirstRelease)

	* If the collection-based flag is missing, fall back on deposition timing:
	* race entity if deposited within 2 years of the cluster's first deposit.
	* NOTE(review): this fallback uses deposition dates (less2YrEntity), not
	* release dates -- confirm that this is the intended definition.
	replace raceEntity = less2YrEntity if missing(raceEntity)
	replace nonRaceEntity = greater2YrEntity if missing(nonRaceEntity)
	assert raceEntity + nonRaceEntity == 1

	bys clusterNum100: egen clusterCountRace = total(raceEntity)
	bys clusterNum100: egen clusterCountNonRace = total(nonRaceEntity)

	* Flag entities as priority: released on the cluster's first release date.
	* NOTE(review): if releaseDate is missing for every entity in a cluster,
	* missing == missing is true and all of them are flagged. Confirm
	* releaseDate is never missing.
	gen priorityEntity = (releaseDate == firstRelease)

* Merge back all observations (including those missing cluster number)
	keep structureId entityId clusterCount* priorityEntity
	tempfile new_vars
	save `new_vars'

	restore

	merge 1:1 structureId entityId using `new_vars'
	assert _merge != 2
	drop _merge

	save "${data_built}pdb_full_entities.dta", replace
